#This file shows how the topic model was run.  
#While we cannot make the data publicly available, 
#please get in contact with us if you would like to use it for research.

#Read the previously segmented cases from disk and parse the JSON
json <- readLines("admin_lit.txt")
segd <- jsonlite::fromJSON(json, simplifyDataFrame = TRUE)

#Capitalise the four section fields; every other name is left untouched
renamed <- c(facts = "Facts", holding = "Holding",
             decision = "Decision", parties = "Parties")
to_rename <- names(segd) %in% names(renamed)
names(segd)[to_rename] <- unname(renamed[names(segd)[to_rename]])

#Pull the individual fields out of the parsed cases
#(printText and the extract* helpers are defined outside this script)
printtext <- printText(segd)
text <- extractText(segd)
parties <- extractParties(segd)
division <- extractDivision(segd)
court <- extractCourt(segd)
id <- extractCaseID(segd)
converted_date <- extractDate(segd)

#Combine the extracted fields into the working data frame
merged <- data.frame(
  fileid = segd$file,
  textseg = text,
  parties = parties,
  division = division,
  court = court,
  id = id,
  converted_date = converted_date
)

#Flag enforcement cases that remain in the data.
#Each flag is a logical vector with one entry per row of `merged`.

#Parties field mentions "申请人" (the same pattern also matches "被申请人")
shenqing <- str_detect(merged[["parties"]], "申请人")

#Court division field contains "审"
shen <- str_detect(merged[["division"]], "审")

#Case id contains "民行执"
#(`min` shares its name with base::min(); calls to min() still find the function)
min <- str_detect(merged[["id"]], "民行执")

#Run the STM
library(stm)

#Remove the flagged enforcement cases: rows whose parties mention 申请人 and
#whose division contains 审 or whose case id contains 民行执.
#str_detect() returns NA for missing fields, and an NA inside a logical row
#index makes `[` return an all-NA row, so rows that cannot be classified are
#treated as "not flagged" and kept intact.
enforcement <- (shenqing & shen) | (min & shenqing)
enforcement[is.na(enforcement)] <- FALSE
merged <- merged[!enforcement, ]
#Take out place names
#Drop whitespace-delimited tokens that end in an administrative-unit suffix.
#The suffixes are an alternation group, not a bracket expression: inside [...]
#every character (including "|") is a separate single-character alternative,
#so the two-character suffix 中心 could never be matched as a unit.
merged$textnopl <- gsub("\\S+(市|县|区|省|中心|乡|村|镇)\\s", "", merged$textseg)
#Drop court names: everything in a token up to and including the literal 法院.
#Written as a literal rather than [法院], which would match 法 or 院 alone and
#delete unrelated tokens such as 依法 or 医院.
merged$textnopl <- gsub("\\S+法院", "", merged$textnopl)

#Process files
#Build the stm corpus from the place-name-stripped text, passing the whole
#data frame along as document metadata; tokens shorter than 2 characters are dropped.
merged <- as.data.frame(merged)
processed <- textProcessor(
  documents = merged$textnopl,
  metadata = merged,
  wordLengths = c(2, Inf)
)

#Prune the vocabulary: drop terms that occur in too few documents
#(lower.thresh = 50) or in roughly half or more of them (upper.thresh).
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 50,
  upper.thresh = length(processed$documents) / 2
)

#Fit the topic model; K = 0 with spectral initialisation lets stm pick the
#number of topics itself.
stm.out <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 0,
  data = out$meta,
  init.type = "Spectral"
)
